Data Wrangling: Centers for Disease Control and Prevention (CDC) Social Vulnerability Index

Pa-Shun Hawkins & Robert J. Dellinger

Summer Research Project 2024

Introduction

This report documents how we brought the CDC/ATSDR Social Vulnerability Index (SVI) into our project so we can describe and compare the social conditions of places alongside environmental outcomes. The SVI summarizes 15 census/ACS indicators into four themes—(1) Socioeconomic Status, (2) Household Composition & Disability, (3) Minority Status & Language, and (4) Housing Type & Transportation—and provides percentile ranks that indicate how vulnerable a tract or county is relative to all others. We use the SVI for 2000, 2010, and 2020, at both the census-tract and county levels, and subset to our seven states of interest (IL, IN, KY, OH, PA, TN, WV). Our goal is to align these social metrics with our other datasets, maintain comparability across time, and export tidy, analysis-ready tables and maps.

Methodology

We use the CDC/ATSDR Social Vulnerability Index (SVI) releases for 2000, 2010, and 2020, downloaded at both the county and tract levels. All files, data dictionaries, and documentation come from the CDC/ATSDR SVI portal.

Setup Packages

Loading CDC SVI County Data (2000, 2010, 2020)

## -----------------------------
## 2020 COUNTIES
## -----------------------------

# Folder holding the 2020 county-level SVI CSV file(s).
CDC_SVI_2020_county_dir <- here(
  "Data", "CDC_Social_Vulnerability_Index", "2020_County_Data"
)

# Every CSV beneath that folder, searched recursively.
CDC_SVI_2020_county_files <- fs::dir_ls(
  CDC_SVI_2020_county_dir,
  regexp  = "\\.csv$",
  recurse = TRUE
)

# Read and stack the files, split the combined `stcnty` code into its state
# and county parts, and move the identifier columns to the front.
CDC_SVI_2020_county <-
  CDC_SVI_2020_county_files %>%
  map(read_csv, show_col_types = FALSE) %>%
  bind_rows() %>%
  clean_names() %>%
  mutate(
    year        = 2020,
    state_fips  = substr(stcnty, 1, 2),
    county_fips = substr(stcnty, 3, 5)
  ) %>%
  # -999 is treated as missing; recode it to NA in numeric columns only
  mutate(across(where(is.numeric), function(col) na_if(col, -999))) %>%
  dplyr::select(-c(st, stcnty, state, county)) %>%
  dplyr::relocate(st_abbr, state_fips, county_fips)

## -----------------------------
## 2010 COUNTIES
## -----------------------------

# Folder holding the 2010 county-level SVI CSV file(s).
CDC_SVI_2010_county_dir <- here(
  "Data", "CDC_Social_Vulnerability_Index", "2010_County_Data"
)

# Every CSV beneath that folder, searched recursively.
CDC_SVI_2010_county_files <- fs::dir_ls(
  CDC_SVI_2010_county_dir,
  regexp  = "\\.csv$",
  recurse = TRUE
)

# Read and stack the files, split the combined `fips` code into its state and
# county parts, and move the identifier columns to the front.
CDC_SVI_2010_county <-
  CDC_SVI_2010_county_files %>%
  map(read_csv, show_col_types = FALSE) %>%
  bind_rows() %>%
  clean_names() %>%
  mutate(
    year        = 2010,
    state_fips  = substr(fips, 1, 2),
    county_fips = substr(fips, 3, 5)
  ) %>%
  # -999 is treated as missing; recode it to NA in numeric columns only
  mutate(across(where(is.numeric), function(col) na_if(col, -999))) %>%
  dplyr::select(-c(state, fips)) %>%
  # the 2010 file stores the state abbreviation in `st`
  dplyr::rename(st_abbr = st) %>%
  dplyr::relocate(st_abbr, state_fips, county_fips)


## -----------------------------
## 2000 COUNTIES
## -----------------------------

# Folder holding the 2000 county-level SVI CSV file(s).
CDC_SVI_2000_county_dir <- here("Data", "CDC_Social_Vulnerability_Index",
                                "2000_County_Data")

# Every CSV beneath that folder, searched recursively.
CDC_SVI_2000_county_files <- fs::dir_ls(CDC_SVI_2000_county_dir,
                             regexp = "\\.csv$", recurse = TRUE)

# Read and stack the files and standardise the identifier columns so they
# line up with the 2010 and 2020 county tables.
CDC_SVI_2000_county <-
  map_dfr(CDC_SVI_2000_county_files, read_csv, show_col_types = FALSE) %>%
  clean_names() %>%
  mutate(
    year        = 2000,
    # Zero-pad to 2 and 3 characters. A bare as.character() would turn a
    # numeric county code such as 1 into "1" rather than "001", which would
    # not match the substr()-derived codes used for 2010 and 2020.
    # Assumes the FIPS parts are integer-like and never missing.
    state_fips  = sprintf("%02d", as.integer(state_fips)),
    county_fips = sprintf("%03d", as.integer(cnty_fips))
  ) %>%
  # CDC: treat -999 as missing, but ONLY for numeric columns
  mutate(across(where(is.numeric), ~na_if(.x, -999))) %>%
  dplyr::select(-county, -stcofips, -cnty_fips) %>%
  dplyr::select(st_abbr = state_abbr, state_fips, county_fips, everything())

#print(CDC_SVI_2020_county)
#print(CDC_SVI_2010_county)
#print(CDC_SVI_2000_county)
library(dplyr)
library(janitor)
library(gt)
library(scales)

# Classify 0–1 percentile ranks into four CDC-style quartile bins.
#
# Bins are right-closed, so a value exactly on a cut point (0.25, 0.50, 0.75)
# falls in the lower bin. `NA` input stays `NA`.
# Returns a factor whose levels are the four labels in ascending order.
quartile_lab <- function(x) {
  bin_edges <- c(-Inf, 0.25, 0.50, 0.75, Inf)
  bin_names <- c(
    "0–0.25 (Least)",
    "0.25–0.50",
    "0.50–0.75",
    "0.75–1.00 (Most)"
  )
  cut(x, breaks = bin_edges, labels = bin_names, right = TRUE)
}

# County-level 2020 SVI with the CDC-supplied overall percentile (`svi_nat`),
# a percentile re-ranked within each state (`svi_state`), and quartile labels
# for both.
county_2020_ranked <-
  CDC_SVI_2020_county %>%
  clean_names() %>%
  # keep rows with a valid 0-1 percentile; this also drops NA, which is how
  # the -999 missing code arrives after the import step
  filter(rpl_themes >= 0, rpl_themes <= 1) %>%
  mutate(svi_nat = rpl_themes) %>%
  group_by(st_abbr) %>%
  mutate(svi_state = percent_rank(svi_nat)) %>%  # re-rank within state
  ungroup() %>%
  mutate(
    nat_quart   = quartile_lab(svi_nat),
    state_quart = quartile_lab(svi_state)
  ) %>%
  dplyr::select(st_abbr, county_fips, location, svi_nat, svi_state, nat_quart, state_quart)

# Show the 25 highest national SVI counties.
# Every displayed column gets a readable label, matching the tract-level table.
county_2020_ranked %>%
  dplyr::slice_max(order_by = svi_nat, n = 25, with_ties = FALSE) %>%
  gt::gt() %>%
  gt::fmt_number(columns = c(svi_nat, svi_state), decimals = 3) %>%
  gt::cols_label(
    st_abbr     = "State",
    county_fips = "County FIPS",
    location    = "County",
    svi_nat     = "Overall SVI (national)",
    svi_state   = "Overall SVI (state-based)",
    nat_quart   = "Nat. quartile",
    state_quart = "State quartile"
  ) %>%
  gt::tab_header(
    title    = gt::md("**Between-county Differences in Overall SVI (2020)**"),
    subtitle = gt::md("National vs. within-state percentiles & CDC-style quartiles")
  ) %>%
  gt::tab_source_note(
    gt::md("Notes: `RPL_THEMES` is the CDC national percentile rank (0–1). `svi_state` is re-ranked within state (0–1).")
  )
Between-county Differences in Overall SVI (2020)
National vs. within-state percentiles & CDC-style quartiles
State County FIPS County svi_nat Overall SVI (state-based) nat_quart State quartile
IL 003 Alexander County, Illinois 1.000 1.000 0.75–1.00 (Most) 0.75–1.00 (Most)
IN 097 Marion County, Indiana 1.000 1.000 0.75–1.00 (Most) 0.75–1.00 (Most)
KY 075 Fulton County, Kentucky 1.000 1.000 0.75–1.00 (Most) 0.75–1.00 (Most)
OH 007 Ashtabula County, Ohio 1.000 1.000 0.75–1.00 (Most) 0.75–1.00 (Most)
PA 101 Philadelphia County, Pennsylvania 1.000 1.000 0.75–1.00 (Most) 0.75–1.00 (Most)
TN 097 Lauderdale County, Tennessee 1.000 1.000 0.75–1.00 (Most) 0.75–1.00 (Most)
WV 055 Mercer County, West Virginia 1.000 1.000 0.75–1.00 (Most) 0.75–1.00 (Most)
KY 047 Christian County, Kentucky 0.992 0.992 0.75–1.00 (Most) 0.75–1.00 (Most)
IL 165 Saline County, Illinois 0.990 0.990 0.75–1.00 (Most) 0.75–1.00 (Most)
TN 095 Lake County, Tennessee 0.989 0.989 0.75–1.00 (Most) 0.75–1.00 (Most)
IN 039 Elkhart County, Indiana 0.989 0.989 0.75–1.00 (Most) 0.75–1.00 (Most)
OH 023 Clark County, Ohio 0.989 0.989 0.75–1.00 (Most) 0.75–1.00 (Most)
PA 077 Lehigh County, Pennsylvania 0.985 0.985 0.75–1.00 (Most) 0.75–1.00 (Most)
KY 095 Harlan County, Kentucky 0.983 0.983 0.75–1.00 (Most) 0.75–1.00 (Most)
WV 047 McDowell County, West Virginia 0.982 0.981 0.75–1.00 (Most) 0.75–1.00 (Most)
IL 201 Winnebago County, Illinois 0.980 0.980 0.75–1.00 (Most) 0.75–1.00 (Most)
TN 061 Grundy County, Tennessee 0.979 0.979 0.75–1.00 (Most) 0.75–1.00 (Most)
IN 041 Fayette County, Indiana 0.978 0.978 0.75–1.00 (Most) 0.75–1.00 (Most)
OH 113 Montgomery County, Ohio 0.977 0.977 0.75–1.00 (Most) 0.75–1.00 (Most)
KY 065 Estill County, Kentucky 0.975 0.975 0.75–1.00 (Most) 0.75–1.00 (Most)
IL 077 Jackson County, Illinois 0.970 0.970 0.75–1.00 (Most) 0.75–1.00 (Most)
PA 049 Erie County, Pennsylvania 0.970 0.970 0.75–1.00 (Most) 0.75–1.00 (Most)
TN 069 Hardeman County, Tennessee 0.968 0.968 0.75–1.00 (Most) 0.75–1.00 (Most)
IN 177 Wayne County, Indiana 0.967 0.967 0.75–1.00 (Most) 0.75–1.00 (Most)
KY 051 Clay County, Kentucky 0.966 0.966 0.75–1.00 (Most) 0.75–1.00 (Most)
Notes: RPL_THEMES is the CDC national percentile rank (0–1). svi_state is re-ranked within state (0–1)..

Loading CDC SVI Census Tract Data (2000, 2010, 2020)

## -----------------------------
## 2020 TRACTS
## -----------------------------
# Folder holding the 2020 tract-level SVI CSV file(s).
CDC_SVI_2020_census_tract_dir <- here("Data","CDC_Social_Vulnerability_Index","2020_Census_Tract_Data")

# Every CSV beneath that folder, searched recursively.
CDC_SVI_2020_census_tract_files <- fs::dir_ls(CDC_SVI_2020_census_tract_dir,
                                              regexp = "\\.csv$", recurse = TRUE)

# Read and stack the files, derive zero-padded state/county FIPS parts and an
# 11-digit tract `geoid`, and move the identifier columns to the front.
CDC_SVI_2020_census_tract <-
  map_dfr(CDC_SVI_2020_census_tract_files, read_csv, show_col_types = FALSE) %>%
  clean_names() %>%
  mutate(
    year        = 2020,
    # Pad the 5-digit state+county code with "%05d" before splitting it.
    # Zero-padding a string with "%0Ns" is platform-dependent in sprintf().
    # Assumes `stcnty` is integer-like and never missing.
    stcnty      = sprintf("%05d", as.integer(stcnty)),
    state_fips  = substr(stcnty, 1, 2),
    county_fips = substr(stcnty, 3, 5),
    # Prefer the tract FIPS column when present; as.numeric() lets "%011.0f"
    # work whether it was read as a number or as text.
    geoid       = if ("fips" %in% names(.)) {
      sprintf("%011.0f", as.numeric(fips))
    } else {
      paste0(state_fips, county_fips, tract)
    }
  )  %>%
  # CDC: treat -999 as missing, but ONLY for numeric columns
  mutate(across(where(is.numeric), ~na_if(.x, -999))) %>%
  dplyr::select(state, st_abbr, state_fips, county, county_fips, geoid, year, everything(), -st, -stcnty)

## -----------------------------
## 2010 TRACTS
## -----------------------------
# Folder holding the 2010 tract-level SVI CSV file(s).
CDC_SVI_2010_census_tract_dir <- here("Data","CDC_Social_Vulnerability_Index","2010_Census_Tract_Data")

# Every CSV beneath that folder, searched recursively.
CDC_SVI_2010_census_tract_files <- fs::dir_ls(CDC_SVI_2010_census_tract_dir,
                                              regexp = "\\.csv$", recurse = TRUE)

# Read and stack the files, derive zero-padded state/county FIPS parts and an
# 11-digit tract `geoid`, and move the identifier columns to the front.
CDC_SVI_2010_census_tract <-
  map_dfr(CDC_SVI_2010_census_tract_files, read_csv, show_col_types = FALSE) %>%
  clean_names() %>%
  mutate(
    year        = 2010,
    # "%0Nd" on an integer zero-pads on every platform; "%0Ns" on a string
    # pads with zeros on some platforms and spaces on others.
    # Assumes the FIPS parts are integer-like and never missing.
    state_fips  = sprintf("%02d", as.integer(state_fips)),
    county_fips = sprintf("%03d", as.integer(cnty_fips)),
    # Prefer the tract FIPS column when present; as.numeric() lets "%011.0f"
    # work whether it was read as a number or as text.
    geoid       = if ("fips" %in% names(.)) {
      sprintf("%011.0f", as.numeric(fips))
    } else {
      paste0(state_fips, county_fips, tract)
    },
    st_abbr     = state_abbr,
    state       = state_name
  )  %>%
  # CDC: treat -999 as missing, but ONLY for numeric columns
  mutate(across(where(is.numeric), ~na_if(.x, -999))) %>%
  dplyr::select(state, st_abbr, state_fips, county, county_fips, geoid, year, everything(),
                -stcofips, -cnty_fips)

## -----------------------------
## 2000 TRACTS
## -----------------------------
# Folder holding the 2000 tract-level SVI CSV file(s).
CDC_SVI_2000_census_tract_dir <- here("Data","CDC_Social_Vulnerability_Index","2000_Census_Tract_Data")

# Every CSV beneath that folder, searched recursively.
CDC_SVI_2000_census_tract_files <- fs::dir_ls(CDC_SVI_2000_census_tract_dir,
                                              regexp = "\\.csv$", recurse = TRUE)

# Read and stack the files, derive zero-padded state/county FIPS parts and a
# tract `geoid`, recode the -999 missing code, and move the identifier
# columns to the front.
CDC_SVI_2000_census_tract <-
  map_dfr(CDC_SVI_2000_census_tract_files, read_csv, show_col_types = FALSE) %>%
  clean_names() %>%
  mutate(
    year        = 2000,
    # as.integer() makes "%0Nd" work for codes read as double or as text.
    # "%0Ns" on a string is not zero-padded on every platform.
    # Assumes the FIPS parts are integer-like and never missing.
    state_fips  = sprintf("%02d", as.integer(state_fips)),
    county_fips = sprintf("%03d", as.integer(cnty_fips)),
    # Same rule as the 2010 and 2020 tract tables: use the `fips` column when
    # the file has one, otherwise assemble the id from its parts.
    # NOTE(review): the fallback assumes `tract` keeps its leading zeros.
    geoid       = if ("fips" %in% names(.)) {
      sprintf("%011.0f", as.numeric(fips))
    } else {
      paste0(state_fips, county_fips, tract)
    },
    st_abbr     = state_abbr,
    state       = state_name
  ) %>%
  # CDC: treat -999 as missing, but ONLY for numeric columns
  # (the 2010 and 2020 tract tables apply the same recode)
  mutate(across(where(is.numeric), ~na_if(.x, -999))) %>%
  dplyr::select(state, st_abbr, state_fips, county, county_fips, geoid, year, everything(),
                -stcofips, -cnty_fips)

#print(CDC_SVI_2020_census_tract)
#print(CDC_SVI_2010_census_tract)
#print(CDC_SVI_2000_census_tract)
# Tract-level 2020 SVI with the CDC-supplied overall percentile (`svi_nat`),
# a percentile re-ranked within each state (`svi_state`), and quartile labels
# for both.
tract_2020_ranked <-
  CDC_SVI_2020_census_tract %>%
  clean_names() %>%
  # only rows with a usable overall percentile in [0, 1]
  filter(!is.na(rpl_themes)) %>%
  filter(rpl_themes >= 0, rpl_themes <= 1) %>%
  mutate(
    st_abbr = coalesce(st_abbr, state),  # fall back to `state` if abbr is NA
    svi_nat = rpl_themes                 # percentile as supplied by CDC
  ) %>%
  group_by(st_abbr) %>%
  mutate(svi_state = percent_rank(svi_nat)) %>%
  ungroup() %>%
  # narrow to the reporting columns first, then append the quartile labels
  dplyr::select(st_abbr, county, location, svi_nat, svi_state) %>%
  mutate(
    nat_quart   = quartile_lab(svi_nat),
    state_quart = quartile_lab(svi_state)
  )

# Table of the 25 tracts with the highest overall SVI percentile
tract_2020_ranked %>%
  arrange(desc(svi_nat)) %>%
  slice_head(n = 25) %>%
  gt::gt() %>%
  gt::tab_header(
    title    = gt::md("**Between-tract Differences in Overall SVI (2020)**"),
    subtitle = gt::md("National vs. within-state percentiles & CDC-style quartiles")
  ) %>%
  gt::cols_label(
    st_abbr     = "State",
    county      = "County",
    location    = "Location",
    svi_nat     = "Overall SVI (national)",
    svi_state   = "Overall SVI (state-based)",
    nat_quart   = "Nat. quartile",
    state_quart = "State quartile"
  ) %>%
  # three decimals for both percentile columns
  gt::fmt_number(columns = c(svi_nat, svi_state), decimals = 3) %>%
  gt::tab_source_note(
    gt::md("Notes: `RPL_THEMES` is the CDC national percentile rank (0–1). `svi_state` is re-ranked within state (0–1).")
  )
Between-tract Differences in Overall SVI (2020)
National vs. within-state percentiles & CDC-style quartiles
State County Location Overall SVI (national) Overall SVI (state-based) Nat. quartile State quartile
IL Lake Census Tract 8623, Lake County, Illinois 1.000 1.000 0.75–1.00 (Most) 0.75–1.00 (Most)
IN Lake Census Tract 310, Lake County, Indiana 1.000 1.000 0.75–1.00 (Most) 0.75–1.00 (Most)
KY Jefferson Census Tract 30, Jefferson County, Kentucky 1.000 1.000 0.75–1.00 (Most) 0.75–1.00 (Most)
OH Franklin Census Tract 51, Franklin County, Ohio 1.000 1.000 0.75–1.00 (Most) 0.75–1.00 (Most)
PA Berks Census Tract 25, Berks County, Pennsylvania 1.000 1.000 0.75–1.00 (Most) 0.75–1.00 (Most)
TN Shelby Census Tract 105, Shelby County, Tennessee 1.000 1.000 0.75–1.00 (Most) 0.75–1.00 (Most)
WV Monongalia Census Tract 101.03, Monongalia County, West Virginia 1.000 1.000 0.75–1.00 (Most) 0.75–1.00 (Most)
IL Cook Census Tract 2603, Cook County, Illinois 1.000 1.000 0.75–1.00 (Most) 0.75–1.00 (Most)
OH Franklin Census Tract 26, Franklin County, Ohio 1.000 1.000 0.75–1.00 (Most) 0.75–1.00 (Most)
PA York Census Tract 12, York County, Pennsylvania 1.000 1.000 0.75–1.00 (Most) 0.75–1.00 (Most)
IL Cook Census Tract 8386, Cook County, Illinois 0.999 0.999 0.75–1.00 (Most) 0.75–1.00 (Most)
IN Elkhart Census Tract 26, Elkhart County, Indiana 0.999 0.999 0.75–1.00 (Most) 0.75–1.00 (Most)
OH Mahoning Census Tract 8141, Mahoning County, Ohio 0.999 0.999 0.75–1.00 (Most) 0.75–1.00 (Most)
PA Allegheny Census Tract 1209, Allegheny County, Pennsylvania 0.999 0.999 0.75–1.00 (Most) 0.75–1.00 (Most)
TN Shelby Census Tract 8, Shelby County, Tennessee 0.999 0.999 0.75–1.00 (Most) 0.75–1.00 (Most)
KY Boone Census Tract 703.01, Boone County, Kentucky 0.999 0.999 0.75–1.00 (Most) 0.75–1.00 (Most)
IL Lake Census Tract 8626.05, Lake County, Illinois 0.999 0.999 0.75–1.00 (Most) 0.75–1.00 (Most)
PA Philadelphia Census Tract 176.01, Philadelphia County, Pennsylvania 0.999 0.999 0.75–1.00 (Most) 0.75–1.00 (Most)
OH Cuyahoga Census Tract 1976, Cuyahoga County, Ohio 0.999 0.999 0.75–1.00 (Most) 0.75–1.00 (Most)
IL Cook Census Tract 3016, Cook County, Illinois 0.999 0.999 0.75–1.00 (Most) 0.75–1.00 (Most)
IN Jackson Census Tract 9679.01, Jackson County, Indiana 0.999 0.999 0.75–1.00 (Most) 0.75–1.00 (Most)
PA Erie Census Tract 13, Erie County, Pennsylvania 0.999 0.999 0.75–1.00 (Most) 0.75–1.00 (Most)
TN Hamblen Census Tract 1001, Hamblen County, Tennessee 0.999 0.999 0.75–1.00 (Most) 0.75–1.00 (Most)
OH Hamilton Census Tract 68, Hamilton County, Ohio 0.999 0.999 0.75–1.00 (Most) 0.75–1.00 (Most)
IL Cook Census Tract 8293.02, Cook County, Illinois 0.999 0.998 0.75–1.00 (Most) 0.75–1.00 (Most)
Notes: RPL_THEMES is the CDC national percentile rank (0–1). svi_state is re-ranked within state (0–1).

Mapping SVI by County and Census Tract (2020)

# Postal abbreviations of the seven study states.
states_of_interest <- c(
  "IL", "IN", "KY", "OH", "PA", "TN", "WV"
)

# Fill colours for the four quartile classes, in factor-level order.
# Left unnamed so the fill scale assigns them by level position.
cdc_cols <- c(
  "#c8e6c9",
  "#fff9c4",
  "#ffcc80",
  "#ef5350"
)

# ----------------------------
# 1) County polygons
# ----------------------------
# 2020 cartographic-boundary county polygons for the study states, with a
# 5-character county `geoid` to join on, reprojected to EPSG:4326.
counties_sf <- tigris::counties(cb = TRUE, year = 2020, class = "sf") %>%
  janitor::clean_names() %>%                      # clean first
  dplyr::filter(stusps %in% states_of_interest) %>%
  dplyr::mutate(
    state_fips  = statefp,                        # the 2-digit FIPS
    county_fips = countyfp,                       # the 3-digit FIPS
    # paste0() instead of sprintf("%02s%03s", ...): zero-padding a string
    # with "%0Ns" is platform-dependent.
    geoid       = paste0(state_fips, county_fips)
  ) %>%
  sf::st_transform(4326)

# keep just the columns we need, make a 5-digit county geoid
# (state_fips and county_fips come from substr() on the import, so plain
# concatenation gives the join key; "%0Ns" padding in sprintf() is not
# portable across platforms)
county_2020 <- CDC_SVI_2020_county %>%
  dplyr::filter(year == 2020) %>%
  dplyr::mutate(geoid = paste0(state_fips, county_fips)) %>%
  dplyr::select(geoid, state = st_abbr, county = county_fips, year, rpl_themes)

# Attach the SVI values to the county polygons by `geoid`; polygons with no
# match keep NA.
county_2020_map <- dplyr::left_join(counties_sf, county_2020, by = "geoid")

# `rpl_quint` holds the four-level class returned by quartile_lab().
county_2020_map <- dplyr::mutate(
  county_2020_map,
  rpl_quint = quartile_lab(rpl_themes)
)

# County choropleth of the overall 2020 SVI.
# The fill variable `rpl_quint` comes from quartile_lab(), so it has four
# classes (quartiles); the subtitle describes it accordingly.
p_svi_2020 <- ggplot(county_2020_map) +
  geom_sf(aes(fill = rpl_quint), color = NA) +
  # drop = FALSE keeps all four classes in the legend even if one is empty
  scale_fill_manual(values = cdc_cols, drop = FALSE, name = "Overall SVI\n(percentile)") +
  labs(
    title    = "CDC Social Vulnerability Index (Overall, 2020)",
    subtitle = "Classed into CDC-style quartile bins",
    caption  = "Source: CDC SVI 2020 • Projection: WGS84 / EPSG:4326"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    legend.position   = "bottom",
    panel.grid        = element_blank(),
    axis.title        = element_blank(),
    panel.background  = element_rect(fill = NA, colour = NA),
    plot.background   = element_rect(fill = NA, colour = NA),
    legend.background = element_rect(fill = NA, colour = NA)
  )
print(p_svi_2020)

# Output path kept as-is so existing references to the file still resolve,
# even though the classes are quartiles.
ggsave(
  here::here("Output", "Figures", "CDC_SVI_overall_2020_county_quintiles.png"),
  p_svi_2020, width = 7.5, height = 6, dpi = 600
)

# Download 2020 cartographic-boundary tract polygons one state at a time,
# stack them, and build the tract `geoid` from state + county + tract codes.
tracts_sf <- states_of_interest %>%
  map(function(st) {
    tigris::tracts(cb = TRUE, year = 2020, state = st, class = "sf")
  }) %>%
  bind_rows() %>%
  janitor::clean_names() %>%
  mutate(
    state_fips  = statefp,   # 2-digit state code
    county_fips = countyfp,  # 3-digit county code
    tract_code  = tractce,   # 6-digit tract code
    geoid       = paste0(statefp, countyfp, tractce)
  ) %>%
  sf::st_transform(4326)

# 2020 tract SVI reduced to the join key and its quartile class.
# Rows whose overall percentile is missing or outside [0, 1] are dropped.
tract_2020 <- CDC_SVI_2020_census_tract %>%
  filter(
    year == 2020,
    !is.na(rpl_themes),
    between(rpl_themes, 0, 1)
  ) %>%
  # `geoid` is carried over unchanged from the import step
  transmute(geoid, rpl_quart = quartile_lab(rpl_themes))

# Join the quartile classes onto the tract polygons by `geoid`.
tract_2020_map <- tracts_sf %>%
  left_join(tract_2020, by = "geoid")

# Tract choropleth of the overall 2020 SVI quartile.
# Only tracts without a quartile class are excluded. na.omit(x, rpl_quart)
# would ignore its second argument and drop every row with an NA in any
# column.
p_svi_2020_tract <-
  ggplot(dplyr::filter(tract_2020_map, !is.na(rpl_quart))) +
  geom_sf(aes(fill = rpl_quart), color = NA) +
  scale_fill_manual(
    values    = cdc_cols,
    na.value  = "grey95",
    drop      = FALSE,   # keep all four classes in the legend
    name      = "Overall SVI\n(quartile)"
  ) +
  labs(
    title    = "CDC Social Vulnerability Index (2020)",
    subtitle = "Census tracts classed into CDC-style quartiles",
    caption  = "Source: CDC SVI 2020 • Projection: WGS84 / EPSG:4326"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    legend.position   = "bottom",
    panel.grid        = element_blank(),
    axis.title        = element_blank(),
    panel.background  = element_rect(fill = NA, colour = NA)
  )
print(p_svi_2020_tract)

# Save. The plot is passed explicitly so the file does not depend on
# whichever plot happened to be drawn last.
ggsave(
  here::here("Output", "Figures", "CDC_SVI_overall_2020_tract_quartiles.png"),
  plot = p_svi_2020_tract, width = 7.5, height = 6, dpi = 600
)

Citations

Centers for Disease Control and Prevention/ Agency for Toxic Substances and Disease Registry/ Geospatial Research, Analysis, and Services Program. CDC/ATSDR Social Vulnerability Index [2020, 2010, & 2000] Database